Purpose:
Calculate and plot mutational signatures for all samples using COSMIC signatures and Alexandrov et al, 2013 mutational signatures.
Coming soon.
To run this from the command line, use:
Rscript -e "rmarkdown::render('analyses/mutational-signatures/mutational_signatures.Rmd',
clean = TRUE)"
This assumes you are in the top directory of the repository.
Import necessary functions.
# Magrittr pipe
`%>%` <- dplyr::`%>%`
# Import specialized functions
source(file.path("util", "mut_sig_functions.R"))
# Load this library
library(deconstructSigs)
Set up directory paths.
data_dir <- file.path("..", "..", "data")
results_dir <- "results"
plots_dir <- "plots"
figures_dir <- file.path("..", "..", "figures")
cosmic_plots <- file.path(plots_dir, "cosmic")
nature_plots <- file.path(plots_dir, "nature")
Make new directories for the results.
if (!dir.exists(results_dir)) {
dir.create(results_dir)
}
if (!dir.exists(cosmic_plots)) {
dir.create(cosmic_plots, recursive = TRUE)
}
if (!dir.exists(nature_plots)) {
dir.create(nature_plots, recursive = TRUE)
}
Read in the consensus MAF file.
# Declare file path for consensus file
consensus_file <- file.path(data_dir, "pbta-snv-consensus-mutation.maf.tsv.gz")
Read in the consensus MAF file.
# Read in the file
maf <- data.table::fread(consensus_file, data.table = FALSE)
Registered S3 method overwritten by 'R.oo':
method from
throw.default R.methodsS3
Read in the histology color palette.
histology_col_palette <- readr::read_tsv(
file.path(figures_dir, "palettes", "histology_color_palette.tsv")
) %>%
# We'll use deframe so we can use it as a recoding list
tibble::deframe()
Parsed with column specification:
cols(
color_names = col_character(),
hex_codes = col_character()
)
Set up gradient color palette for the bubble matrix plots.
gradient_col_palette <- readr::read_tsv(
file.path(figures_dir, "palettes", "gradient_color_palette.tsv")
)
Parsed with column specification:
cols(
color_names = col_character(),
hex_codes = col_character()
)
# Won't need NA color this time.
gradient_col_palette <- gradient_col_palette %>%
dplyr::filter(color_names != "na_color")
Read in the metadata and set it up with the color palette.
metadata_df <- readr::read_tsv(file.path(data_dir, "pbta-histologies.tsv"), guess_max = 10000) %>%
dplyr::rename(Tumor_Sample_Barcode = "Kids_First_Biospecimen_ID") %>%
dplyr::select("Tumor_Sample_Barcode", "experimental_strategy", "short_histology") %>%
# Easier to deal with NA short histologies if they are labeled something different
dplyr::mutate(short_histology = as.character(tidyr::replace_na(short_histology, "none"))) %>%
# Tack on the sample color using the short_histology column and a recode
dplyr::mutate(sample_color = dplyr::recode(short_histology,
!!!histology_col_palette))
Parsed with column specification:
cols(
.default = col_character(),
OS_days = col_double(),
age_last_update_days = col_double(),
normal_fraction = col_double(),
tumor_fraction = col_double(),
tumor_ploidy = col_double()
)
See spec(...) for full column specifications.
Read in this list so we can make sure we keep only primary tumors for the grouped bar plots.
ind_samples <- readr::read_tsv(file.path(
data_dir,
"independent-specimens.wgswxs.primary.tsv"
))
Parsed with column specification:
cols(
Kids_First_Participant_ID = col_character(),
Kids_First_Biospecimen_ID = col_character()
)
Read in the WGS and WXS regions so they can be used for the Mb denominator.
# Set up BED region files for TMB calculations
wgs_bed <- readr::read_tsv(file.path(data_dir, "WGS.hg38.strelka2.unpadded.bed"),
col_names = FALSE
)
Parsed with column specification:
cols(
X1 = col_character(),
X2 = col_double(),
X3 = col_double()
)
wxs_bed <- readr::read_tsv(file.path(data_dir, "WXS.hg38.100bp_padded.bed"),
col_names = FALSE
)
Parsed with column specification:
cols(
X1 = col_character(),
X2 = col_double(),
X3 = col_double()
)
# Calculate size of genome surveyed
# These files are BED files where the third column is the End position and
# the second column is the Start position.
# So End - Start gives the size of each range. Sum the gives the total size in bp.
wgs_size <- sum(wgs_bed[, 3] - wgs_bed[, 2])
wxs_size <- sum(wxs_bed[, 3] - wxs_bed[, 2])
Determine how many mutations we have per sample.
mut_per_sample <- maf %>%
dplyr::group_by(Tumor_Sample_Barcode) %>%
dplyr::tally() %>%
dplyr::arrange(n)
summary(mut_per_sample$n)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1 81 255 1309 802 139938
Graph this.
ggplot2::ggplot(mut_per_sample, ggplot2::aes(x = n, geom = "density")) +
ggplot2::geom_density() +
ggplot2::theme_classic()
Make mutation data into deconstructSigs input format.
# Convert to deconstructSigs input
sigs_input <- mut.to.sigs.input(
mut.ref = maf,
sample.id = "Tumor_Sample_Barcode",
chr = "Chromosome",
pos = "Start_Position",
ref = "Reference_Allele",
alt = "Allele",
bsg = BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38
)
Warning in mut.to.sigs.input(mut.ref = maf, sample.id = "Tumor_Sample_Barcode", : Some samples have fewer than 50 mutations:
BS_4XPPZTGG, BS_3J4T2YYW, BS_541KKASA, BS_5FBN4PPQ, BS_WD7AQV83, BS_YEWX69X4, BS_QBZDQX7A, BS_T8C13KNH, BS_5TT6TT4K, BS_V2MDX7HG, BS_GRECE8Q9, BS_KKWQW01N, BS_MBYEK6HV, BS_78TQS4Y3, BS_8BM1WTGR, BS_M9YZK3SP, BS_QYWGB52C, BS_XDY9BYRK, BS_AR2BH0Z4, BS_WD37M8SD, BS_FBJ516WW, BS_6HE8H9HY, BS_KSHETTQC, BS_9F2J6VS1, BS_FJEBFQHG, BS_AM2TZPNS, BS_QSMFVHSB, BS_P7VR731D, BS_CRKBDAYZ, BS_ZHJFX6C9, BS_CWQ35N2K, BS_SSYNTXDC, BS_CQ8C6X4X, BS_XVNNWN2Z, BS_B558FJXC, BS_N8D9FH6M, BS_DMBNB0EP, BS_6B62YT97, BS_ACTR13BQ, BS_11322HD1, BS_MSGYZ69T, BS_HYGK88B0, BS_VXNT1TDB, BS_WSFB4SA2, BS_06GPPDHZ, BS_74SJ7YV4, BS_ZKFEMJZ8, BS_H83DTMT2, BS_KM878Y43, BS_5Z4XQC9X, BS_H3TYF6GD, BS_TYFD8ZMG, BS_E5B2YZJH, BS_DT92H1MJ, BS_EKR4NSNW, BS_AEEKJ1QK, BS_NMJPX8VQ, BS_1RF75MK2, BS_YZ2YNEJJ, BS_328C4BW5, BS_EDCQ4M0R, BS_W6B48DD5, BS_52ETE050, BS_XPZAD9PP, BS_PZDTK87A, BS_E7HEQZ1K, BS_J037K4BM, BS_41SWNWAG, BS_R922P3XJ, BS_FG649AWK, BS_W50WEJE7, BS_886M7JMG, BS_7KR13R3P, BS_CN7PB0DN, BS_F7KYPE79, BS_DAEM3WRX, BS_WXKCR17Q, BS_B1X2SB4P, BS_YPCTK927, BS_HNE6BPMH, BS_CDKC1E2B, BS_R3WB0PP7, BS_QD22MJJ5, BS_AGAW7M7W, BS_1RTE2KEX, BS_A6YJ3WG2, BS_XWQTX5HH, BS_Q5XKTB1V, BS_402W79TS, BS_QSDKJM8T, BS_WYTDVC0Y, BS_N2JD3VX6, BS_NTXHP7QR, BS_2KMB51YF, BS_C5RDMCCS, BS_AXW0XBH8, BS_SDT1J3A7, BS_6KEYD1S4, BS_6GTKRGNK, BS_8JFMP1T1, BS_EHBH0NYX, BS_S88G7BXH, BS_BSM3ZHW4, BS_Q807ENGY, BS_6Y7FX7QJ, BS_D3XVXF05, BS_RR3DKGKY, BS_9QMVZ4C0, BS_9W6FK6X4, BS_JJNSP29S, BS_BVD7PWP5, BS_MS87PMR7, BS_7Y5E5KQW, BS_2VHFP7M8, BS_MHT5C2AP, BS_W8DA9AXT, BS_7BNQBB26, BS_MWXDJFWW, BS_B6KCJF2B, BS_74TZJTW0, BS_93BV8AY9, BS_93B38HPS, BS_YVFFHJ1Q, BS_6VTQ3W4M, BS_5S8VXASX, BS_SE82EQF8, BS_R1RMKH1B, BS_CFEWVDG1, BS_WJ7GMQTF, BS_YZ0P5E8H, BS_XKXDH0YJ, BS_XCFNEZ8E, BS_N0Z6V2YG, BS_947CK40E, BS_9E6Q11G2, BS_NHMPQ8Q1, BS_SYWRCE3Z, BS_ZP89F1JY, BS_B4DY7ET3, BS_4M0ZMCDC, BS_BS5X4H0Y, BS_94AP61Q5, BS_PDRNE606, BS_C0YDDQKB, BS_KV3XF9WV, BS_2JDVY4F7, BS_6DT506HY, BS_JCWD4DMA, BS_1135HC0V, BS_1YTHM07J, BS_895FJ34T, BS_5CCAC8TZ, BS_AXM96K20, BS_797V83YM, BS_V3ZNXRHK, BS_EQE19CB0, BS_71HPXFE7, BS_ZJT1REVB, BS_CPZZR2R7, BS_M1YHV93N, BS_G61CN2V0, BS_GAA327FP, BS_NS8G0465, BS_8CA214YR, BS_PMSZKZP7, BS_BZWM455T, BS_2HDEZFTW, BS_1RFBH1SP, BS_922YMFYK, BS_8Q8CAY84, BS_6687ZNGS, BS_JJWC13GG, BS_K07KNTFY, BS_9DN4QR6E, BS_2E81A3FT, BS_SCQNN2A2, BS_MCM78YPC, BS_5Y2Q63Y4, BS_AQMKA8NC, BS_2VSN5YC5, BS_969K7ZM1, BS_V3SC6NWH, BS_NDSWS9TS, BS_NEX92KQH, BS_TPX7YY57, BS_5J5VH3X0, BS_N5VYY66W, BS_TX2WGF8K, BS_KPG4WCHS, BS_FWP8ZA4K, BS_HM5N78GV, BS_K049HVGR, BS_K14VJ1E3, BS_FWCKP4X1, BS_89RCN9PF, BS_Z14Z2JMW, BS_Y0DGBKTY, BS_PVZJCXS1, BS_0QEBZX3M, BS_N8WXTFN4, BS_ETTZM63Y, BS_CX6KACX6, BS_9R82A3VT
Add total mutations per sample.
# Count the total number of signature mutations for each sample
total_muts <- apply(sigs_input, 1, sum)
Get list of tumor sample ids.
tumor_sample_ids <- maf %>%
dplyr::filter(Tumor_Sample_Barcode %in% rownames(sigs_input)) %>%
dplyr::distinct(Tumor_Sample_Barcode) %>%
dplyr::pull(Tumor_Sample_Barcode)
Get COSMIC signatures for each sample. This step will take some time.
sample_sigs_cosmic <- lapply(tumor_sample_ids, function(sample_id) {
# Determine the signatures contributing to the sample
whichSignatures(
tumor.ref = sigs_input,
signatures.ref = signatures.cosmic,
sample.id = sample_id,
contexts.needed = TRUE
)
})
# Bring along the names
names(sample_sigs_cosmic) <- tumor_sample_ids
Get Alexandrov et al, 2013 signatures for each sample.
sample_sigs_nature <- lapply(tumor_sample_ids, function(sample_id) {
# Determine the signatures contributing to the sample
whichSignatures(
tumor.ref = sigs_input,
signatures.ref = signatures.nature2013,
sample.id = sample_id,
contexts.needed = TRUE
)
})
# Bring along the names
names(sample_sigs_nature) <- tumor_sample_ids
sample_mut_sig_plot(
sample_sigs_cosmic,
label = "cosmic",
output_dir = file.path(cosmic_plots, "individual_mutation_sig")
)
sample_mut_sig_plot(
sample_sigs_nature,
label = "nature",
output_dir = file.path(nature_plots, "individual_mutation_sig")
)
Do this for COSMIC mutation signatures.
# Calculate mutations per signature
cosmic_sigs_df <- calc_mut_per_sig(
sample_sigs_cosmic,
muts_per_sample = total_muts,
wgs_genome_size = wgs_size,
wxs_exome_size = wxs_size,
metadata = metadata_df
)
Using Tumor_Sample_Barcode, experimental_strategy, short_histology, sample_color as id variables
# Write this to a file but drop the color column
cosmic_sigs_df %>%
dplyr::select(-sample_color) %>%
readr::write_tsv(file.path(results_dir, "cosmic_signatures_results.tsv"))
# Print out a preview
cosmic_sigs_df
Do this for Alexandrov et al, 2013 mutation signatures.
# Calculate mutations per signature
nature_sigs_df <- calc_mut_per_sig(
sample_sigs_nature,
muts_per_sample = total_muts,
wgs_genome_size = wgs_size,
wxs_exome_size = wxs_size,
metadata = metadata_df
)
Using Tumor_Sample_Barcode, experimental_strategy, short_histology, sample_color as id variables
# Write this to a file but drop the color column
nature_sigs_df %>%
dplyr::select(-sample_color) %>%
readr::write_tsv(file.path(results_dir, "nature_signatures_results.tsv"))
# Print out a preview
nature_sigs_df
bubble_matrix_plot(cosmic_sigs_df,
label = "COSMIC Signatures",
color_palette = gradient_col_palette$hex_codes
)
Warning: Removed 558 rows containing missing values (geom_point).
ggplot2::ggsave(
file.path(cosmic_plots, "bubble_matrix_cosmic_mutation_sig.png"),
width = 30, height = 20, units = "cm")
Warning: Removed 558 rows containing missing values (geom_point).
bubble_matrix_plot(nature_sigs_df,
label = "Alexandrov et al, 2013 signatures",
color_palette = gradient_col_palette$hex_codes)
Warning: Removed 452 rows containing missing values (geom_point).
ggplot2::ggsave(
file.path(nature_plots, "bubble_matrix_nature_mutation_sig.png"),
width = 30, height = 20, units = "cm")
Warning: Removed 452 rows containing missing values (geom_point).
We will make these plots for primary tumor samples only. Let’s make these for COSMIC mutation signatures first.
# Keep only primary tumors
cosmic_sigs_primary <- cosmic_sigs_df %>%
dplyr::filter(Tumor_Sample_Barcode %in% ind_samples$Kids_First_Biospecimen_ID)
# Make grouped bar plots
lapply(unique(cosmic_sigs_primary$short_histology),
grouped_sig_barplot,
sig_num_df = cosmic_sigs_primary,
output_dir = file.path(cosmic_plots, "signature_grouped_barplots"),
label = "cosmic"
)
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Make these plots for Alexandrov et al, 2013 signatures.
# Keep only primary tumors
nature_sigs_primary <- nature_sigs_df %>%
dplyr::filter(Tumor_Sample_Barcode %in% ind_samples$Kids_First_Biospecimen_ID)
# Make grouped bar plots
lapply(unique(nature_sigs_primary$short_histology),
grouped_sig_barplot,
sig_num_df = nature_sigs_primary,
output_dir = file.path(nature_plots, "signature_grouped_barplots"),
label = "nature"
)
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
sessionInfo()
R version 3.6.0 (2019-04-26)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Debian GNU/Linux 9 (stretch)
Matrix products: default
BLAS/LAPACK: /usr/lib/libopenblasp-r0.2.19.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] deconstructSigs_1.8.0
loaded via a namespace (and not attached):
[1] BSgenome.Hsapiens.UCSC.hg38_1.4.1 Rcpp_1.0.1
[3] lattice_0.20-38 tidyr_0.8.3
[5] Rsamtools_2.2.3 Biostrings_2.54.0
[7] assertthat_0.2.1 digest_0.6.20
[9] R6_2.4.0 GenomeInfoDb_1.22.1
[11] plyr_1.8.4 stats4_3.6.0
[13] evaluate_0.14 ggplot2_3.2.0
[15] pillar_1.4.2 zlibbioc_1.32.0
[17] rlang_0.4.0 lazyeval_0.2.2
[19] data.table_1.12.2 S4Vectors_0.24.4
[21] R.utils_2.9.0 R.oo_1.22.0
[23] Matrix_1.2-17 rmarkdown_1.13
[25] labeling_0.3 BiocParallel_1.20.1
[27] readr_1.3.1 stringr_1.4.0
[29] RCurl_1.95-4.12 munsell_0.5.0
[31] DelayedArray_0.12.3 compiler_3.6.0
[33] rtracklayer_1.46.0 xfun_0.8
[35] pkgconfig_2.0.2 BiocGenerics_0.32.0
[37] base64enc_0.1-3 htmltools_0.3.6
[39] tidyselect_0.2.5 SummarizedExperiment_1.16.1
[41] tibble_2.1.3 GenomeInfoDbData_1.2.2
[43] IRanges_2.20.2 matrixStats_0.54.0
[45] XML_3.98-1.20 crayon_1.3.4
[47] dplyr_0.8.3 GenomicAlignments_1.22.1
[49] bitops_1.0-6 R.methodsS3_1.7.1
[51] grid_3.6.0 jsonlite_1.6
[53] gtable_0.3.0 magrittr_1.5
[55] scales_1.0.0 stringi_1.4.3
[57] XVector_0.26.0 reshape2_1.4.3
[59] tools_3.6.0 forcats_0.4.0
[61] BSgenome_1.54.0 Biobase_2.46.0
[63] glue_1.3.1 purrr_0.3.2
[65] hms_0.4.2 parallel_3.6.0
[67] yaml_2.2.0 colorspace_1.4-1
[69] GenomicRanges_1.38.0 knitr_1.23